if (interactive() && !str_ends(getwd(), "R/stat_anal/CITY_US")) {
setwd("R/stat_anal/CITY_US")
}
data <- read_excel("CITY_shortname.xls")
data[data == "NA"] <- NA
data[, -(1:2)] <- data.frame(lapply(data[, -(1:2)], as.numeric))
fullnames <- names(read_excel("CITY.xls"))
print(fullnames)
## [1] "CITY City"
## [2] "STATE State"
## [3] "AREA Land area, 1990 (sq.miles)"
## [4] "R_AREA Land area, 1990 (sq.miles), ranks"
## [5] "POP92 Population,1992"
## [6] "R_POP92 Population,1992, ranks"
## [7] "POP80 Population,1980"
## [8] "R_POP80 Population,1980, ranks"
## [9] "POP_CH Population growth rate,1980-1992"
## [10] "R_POP_CH Population growth rate,1980-1992, ranks"
## [11] "POPDEN Population per square mile, 1992"
## [12] "R_POPDEN Population per square mile, 1992, ranks"
## [13] "OLD % older 65 years"
## [14] "R_OLD % older 65 years, ranks"
## [15] "BLACK Black population,1990"
## [16] "R_BLACK Black population,1990, ranks"
## [17] "BLACK% % Black population,1990"
## [18] "R_BLACK% % Black population,1990, ranks"
## [19] "ASIAN Asian or Pacific Islander population,1990"
## [20] "R_ASIAN Asian or Pacific Islander population,1990, ranks"
## [21] "ASIAN% % Asian or Pacific Islander population,1990"
## [22] "R_ASIAN% % Asian or Pacific Islander population,1990, ranks"
## [23] "HISP Hispanic population, 1990"
## [24] "R_HISP Hispanic population, 1990, ranks"
## [25] "HISP% % Hispanic population, 1990"
## [26] "R_HISP% % Hispanic population, 1990, ranks"
## [27] "BORN_F % persons of foreign born"
## [28] "R_BORN_F % persons of foreign born, ranks"
## [29] "LANG % of persons, speaking language other than English at home, 1990"
## [30] "R_LANG % of persons, speaking language other than English at home, 1990, ranks"
## [31] "HH1 % 1-person households, 1990"
## [32] "R_HH1 % 1-person households, 1990, ranks"
## [33] "FAMIL1 % one-parent family households, 1990"
## [34] "R_FAMIL1 % one-parent family households, 1990, ranks"
## [35] "DEATH Infant death rate per 1000 live births,1988"
## [36] "R_DEATH Infant death rate per 1000 live births,1988, ranks"
## [37] "CRIME Crime rate per 100000 population, 1991"
## [38] "R_CRIME Crime rate per 100000 population, 1991, rate"
## [39] "SCHOOL % of elementary and high school enrollment in public schools, 1990"
## [40] "R_SCHOOL % of elementary and high school enrollment in public schools, 1990, ranks"
## [41] "DEGREE % of adults with a bachelor's degree or higher, 1990"
## [42] "R_DEGREE % of adults with a bachelor's degree or higher, 1990, ranks"
## [43] "INCOME Median household income, 1989"
## [44] "R_INCOME Median household income, 1989, ranks"
## [45] "ASSIST % of households, receiving public assistance,1989"
## [46] "R_ASSIST % of households, receiving public assistance,1989, ranks"
## [47] "POVERT % of persons below poverty level, 1989"
## [48] "R_POVERT % of persons below poverty level, 1989, ranks"
## [49] "OLB_BIL % of housing units built 1939 or earlier, 1990"
## [50] "R_OLD_BI % of housing units built 1939 or earlier, 1990, ranks"
## [51] "OWNER Median value of specified owner-occupied housing units ($), 1990"
## [52] "R_OWNER Median value of specified owner-occupied housing units ($), 1990, ranks"
## [53] "RENTER % renter-occupied housing units,1990"
## [54] "R_RENTER % renter-occupied housing units,1990, ranks"
## [55] "GROSS Median gross rent of specified renter-occupied housing units ($), 1990"
## [56] "R_GROSS Median gross rent of specified renter-occupied housing units ($), 1990, ranks"
## [57] "CONDOM % condominimum occupied housing units, 1990"
## [58] "R_CONDOM % condominimum occupied housing units, 1990, ranks"
## [59] "TRANSP % of workers using public transportation"
## [60] "R_TRANSP % of workers using public transportation, ranks"
## [61] "UNEMP Unemployment rate, 1991"
## [62] "R_UNEMP Unemployment rate, 1991, ranks"
## [63] "LABOR Labor force - % change 1980-1990"
## [64] "R_LABOR Labor force - % change 1980-1990, ranks"
## [65] "LAB_F Female civilian labor force participation rate, 1990"
## [66] "R_LAB_F Female civilian labor force participation rate, 1990, ranks"
## [67] "MANLAB % of civilian labor force emploied in manufacturing, 1990"
## [68] "R_MANLAB % of civilian labor force emploied in manufacturing, 1990, ranks"
## [69] "TAXE City government taxes per capita, 1991"
## [70] "R_TAXE City government taxes per capita, 1991, ranks"
## [71] "TEMPER Average daily temperature in July"
## [72] "R_TEMPER Average daily temperature in July, ranks"
## [73] "PRECEP Annual precipitation"
## [74] "R_PRECEP Annual precipitation"
names_interesting <- c("AREA", "POP80", "POP92", "POPDEN", "CRIME", "BORN_F", "POVERT", "INCOME", "UNEMP", "TEMPER")
data <- data %>% select(all_of(c("CITY", "STATE", names_interesting)))
print(head(data))
## # A tibble: 6 × 12
## CITY STATE AREA POP80 POP92 POPDEN CRIME BORN_F POVERT INCOME UNEMP TEMPER
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 NEW_… NY 309. 7.07e6 7.31e6 23671 9236 28.4 19.3 29823 8.6 76.8
## 2 LOS_… CA 469. 2.97e6 3.49e6 7436 9730 38.4 18.9 30925 9 74.3
## 3 CHIC… IL 227. 3.01e6 2.77e6 12185 NA 16.9 21.6 26301 8.4 75.1
## 4 HOUS… TX 540. 1.60e6 1.69e6 3131 10824 17.8 20.7 26261 6.1 83.5
## 5 PHIL… PA 135. 1.69e6 1.55e6 11492 6835 6.6 20.3 24603 8 76.7
## 6 SAN_… CA 324 8.76e5 1.15e6 3546 8537 20.9 13.4 10 6.2 71
Город и штат качественные, остальные количественные, ранги были порядковыми.
find_mode_freq <- function(x) {
x <- x[!is.na(x)]
return(max(tabulate(match(x, x))))
}
print(data %>% summarise(across(
all_of(names_interesting),
find_mode_freq
)))
## # A tibble: 1 × 10
## AREA POP80 POP92 POPDEN CRIME BORN_F POVERT INCOME UNEMP TEMPER
## <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 1 1 1 2 1 4 2 1 5 3
print(sort(data$UNEMP))
## [1] 2.3 3.2 3.8 3.9 4.1 4.4 4.6 4.7 4.7 4.8 4.8 5.0 5.0 5.0 5.0
## [16] 5.1 5.1 5.1 5.3 5.4 5.4 5.4 5.4 5.4 5.6 5.7 5.9 5.9 5.9 6.0
## [31] 6.0 6.1 6.1 6.1 6.1 6.2 6.2 6.4 6.4 6.5 6.7 6.7 6.7 6.8 6.9
## [46] 6.9 7.0 7.1 7.2 7.2 7.3 7.3 7.3 7.5 7.6 7.7 7.7 7.7 8.0 8.1
## [61] 8.4 8.4 8.5 8.6 9.0 9.0 9.4 9.5 9.6 10.0 10.4 10.6 10.7 11.0 11.9
## [76] 12.6 13.1
Все количественные буду считать непрерывными. возможно UNEMP непрерывный с плохой точностью.
if (interactive()) pdf("ggpairs_unedited.pdf")
ggpairs(
data[, -(1:2)],
lower = list(continuous = wrap("points", alpha = 0.5, size = 0.3)),
diag = list(continuous = "barDiag")
)
if (interactive()) dev.off()
Убираю outliers:
Помечаю некорректные данные в INCOME как NA. Удаляю город из Аляски за плотность населения. Флорида выделсется на BORN_F-INCOME Гаваи выделяются низким уровнем безработицы. странно?
data$INCOME[data$INCOME < 100] <- NA
data <- data %>% filter(STATE != "AK")
Функция, которая логарифмирует, если это сделает выборку симметричнее
log_asymmetric <- function(x) {
if (skewness(x, na.rm = TRUE) < abs(skewness(log(x), na.rm = TRUE))) {
print("default")
return(x)
} else {
print("logged")
return(log(x))
}
}
Автоматически логарифмирую то что имеет асимметрию и длинный хвост справа
data_logged <- data %>%
mutate(across(all_of(names_interesting), log_asymmetric))
## [1] "logged"
## [1] "logged"
## [1] "logged"
## [1] "logged"
## [1] "logged"
## [1] "logged"
## [1] "default"
## [1] "default"
## [1] "logged"
## [1] "default"
if (interactive()) pdf("ggpairs_logged.pdf")
ggpairs(
data_logged[, -(1:2)],
lower = list(continuous = wrap("points", alpha = 0.5, size = 0.3)),
diag = list(continuous = "barDiag")
)
if (interactive()) dev.off()
выглядит однородно.
print_characteristics <- function(x) {
list(
mean = mean(x, na.rm = TRUE),
var = var(x, na.rm = TRUE),
skewness = skewness(x, na.rm = TRUE),
kurtosis = kurtosis(x, na.rm = TRUE) - 3
)
}
sapply(data_logged %>% select(all_of
(names_interesting)), print_characteristics)
## AREA POP80 POP92 POPDEN CRIME BORN_F
## mean 4.754638 12.9069 13.01226 8.257586 9.206558 1.959747
## var 0.6658984 0.5142076 0.4464288 0.5099096 0.06798296 0.8435754
## skewness 0.1317646 1.453879 1.743773 0.1015321 0.1476689 0.3081197
## kurtosis -0.3907363 3.033026 3.885752 -0.1531417 0.08872069 -0.7431516
## POVERT INCOME UNEMP TEMPER
## mean 18.11842 25282.48 1.873674 77.60132
## var 34.68872 14321837 0.09967048 37.59853
## skewness 0.2243883 -0.2196584 -0.2186434 -0.1426746
## kurtosis -0.3539569 -0.6467894 0.6594639 0.7472063
df <- data_logged %>%
select(-CITY, -STATE) %>%
drop_na()
ggplot(melt(cor(df))) +
geom_raster(aes(x = Var2, y = Var1, fill = value)) +
geom_text(aes(x = Var2, y = Var1, label = round(value, 2))) +
scale_fill_gradient2() +
theme_dark() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
df_names <- data_logged %>%
drop_na()
df <- data_logged %>%
select(-CITY, -STATE) %>%
drop_na()
model_ <- lm(INCOME ~ ., data = df, na.action = na.exclude)
model <- lm.beta(model_)
# model.beta <- lm.beta(model)
# summary(model)
summary(model)
##
## Call:
## lm(formula = INCOME ~ ., data = df, na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4034.1 -906.6 70.0 660.6 3604.2
##
## Coefficients:
## Estimate Standardized Std. Error t value Pr(>|t|)
## (Intercept) 1.466e+04 NA 1.239e+04 1.183 0.2417
## AREA -5.795e+05 -1.253e+02 1.582e+06 -0.366 0.7155
## POP80 -3.930e+03 -7.054e-01 1.924e+03 -2.043 0.0459 *
## POP92 5.845e+05 9.995e+01 1.582e+06 0.369 0.7132
## POPDEN -5.790e+05 -1.077e+02 1.582e+06 -0.366 0.7157
## CRIME 3.988e+02 2.530e-02 1.094e+03 0.365 0.7167
## BORN_F 5.286e+02 1.200e-01 4.056e+02 1.303 0.1979
## POVERT -6.005e+02 -8.420e-01 6.116e+01 -9.818 1.07e-13 ***
## UNEMP 1.243e+03 9.800e-02 1.021e+03 1.217 0.2289
## TEMPER -3.654e+01 -5.632e-02 4.573e+01 -0.799 0.4277
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1711 on 55 degrees of freedom
## Multiple R-squared: 0.8269, Adjusted R-squared: 0.7986
## F-statistic: 29.19 on 9 and 55 DF, p-value: < 2.2e-16
ggplot(melt(cov2cor(vcov(model)))) +
geom_raster(aes(x = Var2, y = Var1, fill = value)) +
geom_text(aes(x = Var2, y = Var1, label = round(value, 2))) +
scale_fill_gradient2() +
theme_dark() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
# cov2cor(vcov(model))
hist(residuals(model))
plot(fitted(model), residuals(model))
abline(h = 0, lty = 2)
жаль
есть население в 80 и 92 и другие зависимые
outliers уже убраны, если какие-то и остались, это не так критично
На примере пары признаков строите двумерный доверительный интервал для пары значащих коэффициентов регрессии, интерпретируете: (1) оба признака влияют на результат согласно оценкам коэффициентов регрессии перед ними: или (2) признаки вместе сильно влияют, но не различить, какой из них больше; или (3) непонятно, или оба признака слабо влияют, или оба влияют сильно.
plot_ellipse <- function(model_, id) {
# confidenceEllipse(lm.beta(model_), which.coef = id, vcov. = function(x) cov2cor(vcov(x)))
# points(0, 0, col = "red", pch = 19)
# lines(x = c(0, coef(lm.beta(model_))[id[1]]), c(0, coef(lm.beta(model_))[id[2]]), col = "red")
confidenceEllipse(model_, which.coef = id)
points(0, 0, col = "red", pch = 19)
lines(x = c(0, coef(model_)[id[1]]), c(0, coef(model_)[id[2]]), col = "red")
}
plot_ellipse(model_, c(3, 6))
plot_ellipse(model_, c(3, 7))
# plot_ellipse(model, c(3, 8))
# plot_ellipse(model, c(3, 9))
# plot_ellipse(model, c(3, 10))
plot_ellipse(model_, c(4, 5))
plot_ellipse(model_, c(6, 7))
Сделаем это вручную на основе таблицы Redundancy. Там «независимые переменные» сравниваются по двум критериям - независимость от других «независимых» признаков и зависимость от зависимой переменной. Объясняете, что означают столбцы, что делать, если эти критерии дают противоречивые рекомендации, решаете, какой признак лучше убрать первым.
ols_vif_tol(model_)
## Variables Tolerance VIF
## 1 AREA 2.690263e-08 3.717108e+07
## 2 POP80 2.639800e-02 3.788165e+01
## 3 POP92 4.299229e-08 2.325998e+07
## 4 POPDEN 3.634021e-08 2.751773e+07
## 5 CRIME 6.539582e-01 1.529150e+00
## 6 BORN_F 3.710426e-01 2.695108e+00
## 7 POVERT 4.278842e-01 2.337081e+00
## 8 UNEMP 4.850860e-01 2.061490e+00
## 9 TEMPER 6.334253e-01 1.578718e+00
3 маленьких значения у AREA, POP92, POPDEN, потому что они зависят друг от друга
ols_correlations(model_)
## Correlations
## -------------------------------------------
## Variable Zero Order Partial Part
## -------------------------------------------
## AREA 0.262 -0.049 -0.021
## POP80 0.045 -0.266 -0.115
## POP92 0.211 0.050 0.021
## POPDEN -0.111 -0.049 -0.021
## CRIME -0.299 0.049 0.020
## BORN_F 0.286 0.173 0.073
## POVERT -0.826 -0.798 -0.551
## UNEMP -0.335 0.162 0.068
## TEMPER 0.047 -0.107 -0.045
## -------------------------------------------
model_manual <- lm(INCOME ~ ., data = df, na.action = na.exclude)
summary(model_manual)
##
## Call:
## lm(formula = INCOME ~ ., data = df, na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4034.1 -906.6 70.0 660.6 3604.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14662.88 12389.85 1.183 0.2417
## AREA -579530.59 1581975.16 -0.366 0.7155
## POP80 -3929.98 1923.64 -2.043 0.0459 *
## POP92 584511.90 1582224.00 0.369 0.7132
## POPDEN -579044.47 1581758.68 -0.366 0.7157
## CRIME 398.81 1093.51 0.365 0.7167
## BORN_F 528.57 405.55 1.303 0.1979
## POVERT -600.45 61.16 -9.818 1.07e-13 ***
## UNEMP 1242.52 1021.20 1.217 0.2289
## TEMPER -36.54 45.73 -0.799 0.4277
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1711 on 55 degrees of freedom
## Multiple R-squared: 0.8269, Adjusted R-squared: 0.7986
## F-statistic: 29.19 on 9 and 55 DF, p-value: < 2.2e-16
ols_vif_tol(model_manual)
## Variables Tolerance VIF
## 1 AREA 2.690263e-08 3.717108e+07
## 2 POP80 2.639800e-02 3.788165e+01
## 3 POP92 4.299229e-08 2.325998e+07
## 4 POPDEN 3.634021e-08 2.751773e+07
## 5 CRIME 6.539582e-01 1.529150e+00
## 6 BORN_F 3.710426e-01 2.695108e+00
## 7 POVERT 4.278842e-01 2.337081e+00
## 8 UNEMP 4.850860e-01 2.061490e+00
## 9 TEMPER 6.334253e-01 1.578718e+00
ols_correlations(model_manual)
## Correlations
## -------------------------------------------
## Variable Zero Order Partial Part
## -------------------------------------------
## AREA 0.262 -0.049 -0.021
## POP80 0.045 -0.266 -0.115
## POP92 0.211 0.050 0.021
## POPDEN -0.111 -0.049 -0.021
## CRIME -0.299 0.049 0.020
## BORN_F 0.286 0.173 0.073
## POVERT -0.826 -0.798 -0.551
## UNEMP -0.335 0.162 0.068
## TEMPER 0.047 -0.107 -0.045
## -------------------------------------------
AIC(model_manual)
## [1] 1163.408
удаляю AREA
model_manual <- lm(INCOME ~ POP80 + POP92 + POPDEN + CRIME + BORN_F + POVERT + UNEMP + TEMPER, data = df, na.action = na.exclude)
summary(model_manual)
##
## Call:
## lm(formula = INCOME ~ POP80 + POP92 + POPDEN + CRIME + BORN_F +
## POVERT + UNEMP + TEMPER, data = df, na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4102.8 -947.0 -1.4 627.2 3638.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15255.24 12188.55 1.252 0.2159
## POP80 -3833.72 1890.82 -2.028 0.0474 *
## POP92 4890.61 1998.30 2.447 0.0176 *
## POPDEN 406.80 473.73 0.859 0.3942
## CRIME 418.98 1083.65 0.387 0.7005
## BORN_F 542.00 400.76 1.352 0.1817
## POVERT -603.32 60.18 -10.025 4.18e-14 ***
## UNEMP 1293.58 1003.79 1.289 0.2028
## TEMPER -39.51 44.65 -0.885 0.3800
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1697 on 56 degrees of freedom
## Multiple R-squared: 0.8265, Adjusted R-squared: 0.8017
## F-statistic: 33.34 on 8 and 56 DF, p-value: < 2.2e-16
ols_vif_tol(model_manual)
## Variables Tolerance VIF
## 1 POP80 0.02689994 37.174806
## 2 POP92 0.02653606 37.684568
## 3 POPDEN 0.39887584 2.507046
## 4 CRIME 0.65561925 1.525276
## 5 BORN_F 0.37409814 2.673095
## 6 POVERT 0.43501937 2.298748
## 7 UNEMP 0.49429487 2.023084
## 8 TEMPER 0.65399873 1.529055
ols_correlations(model_manual)
## Correlations
## -------------------------------------------
## Variable Zero Order Partial Part
## -------------------------------------------
## POP80 0.045 -0.262 -0.113
## POP92 0.211 0.311 0.136
## POPDEN -0.111 0.114 0.048
## CRIME -0.299 0.052 0.022
## BORN_F 0.286 0.178 0.075
## POVERT -0.826 -0.801 -0.558
## UNEMP -0.335 0.170 0.072
## TEMPER 0.047 -0.117 -0.049
## -------------------------------------------
AIC(model_manual)
## [1] 1161.566
удаляю CRIME
model_manual <- lm(INCOME ~ POP80 + POP92 + POPDEN + BORN_F + POVERT + UNEMP + TEMPER, data = df, na.action = na.exclude)
summary(model_manual)
##
## Call:
## lm(formula = INCOME ~ POP80 + POP92 + POPDEN + BORN_F + POVERT +
## UNEMP + TEMPER, data = df, na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4105.7 -946.6 14.1 581.3 3649.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19259.27 6379.30 3.019 0.00379 **
## POP80 -3646.34 1813.97 -2.010 0.04916 *
## POP92 4684.82 1911.69 2.451 0.01735 *
## POPDEN 352.48 449.03 0.785 0.43572
## BORN_F 595.95 372.87 1.598 0.11551
## POVERT -598.32 58.34 -10.256 1.48e-14 ***
## UNEMP 1352.77 984.62 1.374 0.17485
## TEMPER -36.11 43.45 -0.831 0.40942
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1685 on 57 degrees of freedom
## Multiple R-squared: 0.826, Adjusted R-squared: 0.8047
## F-statistic: 38.66 on 7 and 57 DF, p-value: < 2.2e-16
ols_vif_tol(model_manual)
## Variables Tolerance VIF
## 1 POP80 0.02879141 34.732583
## 2 POP92 0.02856256 35.010866
## 3 POPDEN 0.43733418 2.286581
## 4 BORN_F 0.42570551 2.349042
## 5 POVERT 0.45606288 2.192680
## 6 UNEMP 0.50606495 1.976031
## 7 TEMPER 0.68039825 1.469727
ols_correlations(model_manual)
## Correlations
## -------------------------------------------
## Variable Zero Order Partial Part
## -------------------------------------------
## POP80 0.045 -0.257 -0.111
## POP92 0.211 0.309 0.135
## POPDEN -0.111 0.103 0.043
## BORN_F 0.286 0.207 0.088
## POVERT -0.826 -0.805 -0.567
## UNEMP -0.335 0.179 0.076
## TEMPER 0.047 -0.109 -0.046
## -------------------------------------------
AIC(model_manual)
## [1] 1159.739
удаляю POPDEN
model_manual <- lm(INCOME ~ POP80 + POP92 + BORN_F + POVERT + UNEMP + TEMPER, data = df, na.action = na.exclude)
summary(model_manual)
##
## Call:
## lm(formula = INCOME ~ POP80 + POP92 + BORN_F + POVERT + UNEMP +
## TEMPER, data = df, na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4073.7 -1009.9 52.5 645.9 3836.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22076.12 5256.84 4.200 9.32e-05 ***
## POP80 -3187.60 1711.56 -1.862 0.0676 .
## POP92 4256.03 1825.91 2.331 0.0233 *
## BORN_F 743.65 320.84 2.318 0.0240 *
## POVERT -595.53 58.04 -10.261 1.18e-14 ***
## UNEMP 1464.30 971.09 1.508 0.1370
## TEMPER -46.39 41.29 -1.123 0.2659
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1679 on 58 degrees of freedom
## Multiple R-squared: 0.8241, Adjusted R-squared: 0.8059
## F-statistic: 45.3 on 6 and 58 DF, p-value: < 2.2e-16
ols_vif_tol(model_manual)
## Variables Tolerance VIF
## 1 POP80 0.03212584 31.127588
## 2 POP92 0.03110197 32.152302
## 3 BORN_F 0.57115395 1.750841
## 4 POVERT 0.45776464 2.184529
## 5 UNEMP 0.51682543 1.934889
## 6 TEMPER 0.74838068 1.336218
ols_correlations(model_manual)
## Correlations
## -------------------------------------------
## Variable Zero Order Partial Part
## -------------------------------------------
## POP80 0.045 -0.238 -0.103
## POP92 0.211 0.293 0.128
## BORN_F 0.286 0.291 0.128
## POVERT -0.826 -0.803 -0.565
## UNEMP -0.335 0.194 0.083
## TEMPER 0.047 -0.146 -0.062
## -------------------------------------------
AIC(model_manual)
## [1] 1158.438
удаляю POP80
model_manual <- lm(INCOME ~ POP92 + BORN_F + POVERT + UNEMP + TEMPER, data = df, na.action = na.exclude)
summary(model_manual)
##
## Call:
## lm(formula = INCOME ~ POP92 + BORN_F + POVERT + UNEMP + TEMPER,
## data = df, na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4987.0 -988.0 -6.1 968.8 3891.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22022.637 5365.601 4.104 0.000126 ***
## POP92 915.530 348.732 2.625 0.011008 *
## BORN_F 1031.022 287.127 3.591 0.000672 ***
## POVERT -647.969 51.798 -12.510 < 2e-16 ***
## UNEMP 1375.190 989.993 1.389 0.170026
## TEMPER -8.666 36.730 -0.236 0.814289
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1714 on 59 degrees of freedom
## Multiple R-squared: 0.8136, Adjusted R-squared: 0.7978
## F-statistic: 51.51 on 5 and 59 DF, p-value: < 2.2e-16
ols_vif_tol(model_manual)
## Variables Tolerance VIF
## 1 POP92 0.8883098 1.125733
## 2 BORN_F 0.7430051 1.345886
## 3 POVERT 0.5986938 1.670303
## 4 UNEMP 0.5180831 1.930192
## 5 TEMPER 0.9855016 1.014712
ols_correlations(model_manual)
## Correlations
## -------------------------------------------
## Variable Zero Order Partial Part
## -------------------------------------------
## POP92 0.211 0.323 0.148
## BORN_F 0.286 0.423 0.202
## POVERT -0.826 -0.852 -0.703
## UNEMP -0.335 0.178 0.078
## TEMPER 0.047 -0.031 -0.013
## -------------------------------------------
AIC(model_manual)
## [1] 1160.214
удаляю TEMPER
model_manual <- lm(INCOME ~ POP92 + BORN_F + POVERT + UNEMP, data = df, na.action = na.exclude)
summary(model_manual)
##
## Call:
## lm(formula = INCOME ~ POP92 + BORN_F + POVERT + UNEMP, data = df,
## na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5032.6 -892.2 38.5 927.5 3894.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 21361.49 4539.65 4.706 1.53e-05 ***
## POP92 911.47 345.55 2.638 0.010615 *
## BORN_F 1030.40 284.85 3.617 0.000612 ***
## POVERT -647.85 51.39 -12.607 < 2e-16 ***
## UNEMP 1393.66 979.10 1.423 0.159795
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1700 on 60 degrees of freedom
## Multiple R-squared: 0.8134, Adjusted R-squared: 0.801
## F-statistic: 65.41 on 4 and 60 DF, p-value: < 2.2e-16
ols_vif_tol(model_manual)
## Variables Tolerance VIF
## 1 POP92 0.8904814 1.122988
## 2 BORN_F 0.7430677 1.345772
## 3 POVERT 0.5987465 1.670156
## 4 UNEMP 0.5213435 1.918121
ols_correlations(model_manual)
## Correlations
## -------------------------------------------
## Variable Zero Order Partial Part
## -------------------------------------------
## POP92 0.211 0.322 0.147
## BORN_F 0.286 0.423 0.202
## POVERT -0.826 -0.852 -0.703
## UNEMP -0.335 0.181 0.079
## -------------------------------------------
AIC(model_manual)
## [1] 1158.275
все
model.start <- lm(INCOME ~ 1, data = df, na.action = na.exclude)
model.end <- model_
stepAIC(model.start, direction = "forward", scope = list(lower = model.start, upper = model.end))
## Start: AIC=1072.95
## INCOME ~ 1
##
## Df Sum of Sq RSS AIC
## + POVERT 1 634322612 295543462 1000.5
## + UNEMP 1 104215646 825650428 1067.2
## + CRIME 1 83098306 846767768 1068.9
## + BORN_F 1 76089645 853776429 1069.4
## + AREA 1 63971971 865894103 1070.3
## + POP92 1 41536695 888329379 1072.0
## <none> 929866074 1073.0
## + POPDEN 1 11367143 918498931 1074.2
## + TEMPER 1 2081008 927785066 1074.8
## + POP80 1 1876520 927989554 1074.8
##
## Step: AIC=1000.45
## INCOME ~ POVERT
##
## Df Sum of Sq RSS AIC
## + BORN_F 1 96401932 199141529 976.78
## + POP92 1 55215679 240327782 989.00
## + UNEMP 1 39526349 256017113 993.11
## + POPDEN 1 37440832 258102630 993.64
## + POP80 1 35237137 260306325 994.19
## <none> 295543462 1000.45
## + CRIME 1 1831380 293712082 1002.04
## + AREA 1 1002737 294540725 1002.23
## + TEMPER 1 210895 295332567 1002.40
##
## Step: AIC=976.78
## INCOME ~ POVERT + BORN_F
##
## Df Sum of Sq RSS AIC
## + POP92 1 19814961 179326568 971.97
## + POP80 1 14807873 184333657 973.76
## + AREA 1 6121204 193020326 976.75
## <none> 199141529 976.78
## + UNEMP 1 5557846 193583683 976.94
## + POPDEN 1 1958416 197183113 978.14
## + CRIME 1 396514 198745015 978.65
## + TEMPER 1 136666 199004864 978.74
##
## Step: AIC=971.97
## INCOME ~ POVERT + BORN_F + POP92
##
## Df Sum of Sq RSS AIC
## + UNEMP 1 5857802 173468766 971.81
## <none> 179326568 971.97
## + POP80 1 5384243 173942325 971.99
## + AREA 1 660704 178665864 973.73
## + POPDEN 1 660499 178666070 973.73
## + TEMPER 1 353458 178973110 973.84
## + CRIME 1 22031 179304537 973.96
##
## Step: AIC=971.81
## INCOME ~ POVERT + BORN_F + POP92 + UNEMP
##
## Df Sum of Sq RSS AIC
## + POP80 1 6384425 167084341 971.38
## <none> 173468766 971.81
## + AREA 1 163549 173305216 973.75
## + TEMPER 1 163530 173305235 973.75
## + POPDEN 1 163512 173305254 973.75
## + CRIME 1 136502 173332264 973.76
##
## Step: AIC=971.38
## INCOME ~ POVERT + BORN_F + POP92 + UNEMP + POP80
##
## Df Sum of Sq RSS AIC
## <none> 167084341 971.38
## + TEMPER 1 3558242 163526099 971.98
## + AREA 1 3347905 163736436 972.06
## + POPDEN 1 3346997 163737344 972.06
## + CRIME 1 95275 166989066 973.34
##
## Call:
## lm(formula = INCOME ~ POVERT + BORN_F + POP92 + UNEMP + POP80,
## data = df, na.action = na.exclude)
##
## Coefficients:
## (Intercept) POVERT BORN_F POP92 UNEMP POP80
## 19372.8 -610.6 826.2 3251.1 1513.0 -2244.4
stepAIC(model_, direction = "backward")
## Start: AIC=976.95
## INCOME ~ AREA + POP80 + POP92 + POPDEN + CRIME + BORN_F + POVERT +
## UNEMP + TEMPER
##
## Df Sum of Sq RSS AIC
## - CRIME 1 389253 161343069 975.10
## - POPDEN 1 392177 161345993 975.10
## - AREA 1 392728 161346544 975.10
## - POP92 1 399383 161353199 975.11
## - TEMPER 1 1868458 162822274 975.70
## - UNEMP 1 4332343 165286159 976.67
## - BORN_F 1 4971076 165924892 976.92
## <none> 160953816 976.95
## - POP80 1 12214364 173168180 979.70
## - POVERT 1 282102879 443056695 1040.76
##
## Step: AIC=975.1
## INCOME ~ AREA + POP80 + POP92 + POPDEN + BORN_F + POVERT + UNEMP +
## TEMPER
##
## Df Sum of Sq RSS AIC
## - POPDEN 1 433672 161776741 973.28
## - AREA 1 434175 161777244 973.28
## - POP92 1 440883 161783952 973.28
## - TEMPER 1 1604802 162947871 973.75
## - UNEMP 1 4814311 166157380 975.01
## <none> 161343069 975.10
## - BORN_F 1 6756546 168099615 975.77
## - POP80 1 11885594 173228663 977.72
## - POVERT 1 291537007 452880076 1040.19
##
## Step: AIC=973.28
## INCOME ~ AREA + POP80 + POP92 + BORN_F + POVERT + UNEMP + TEMPER
##
## Df Sum of Sq RSS AIC
## - AREA 1 1749358 163526099 971.98
## - TEMPER 1 1959695 163736436 972.06
## <none> 161776741 973.28
## - UNEMP 1 5356914 167133655 973.39
## - BORN_F 1 7249116 169025857 974.13
## - POP80 1 11468717 173245458 975.73
## - POP92 1 16568534 178345275 977.61
## - POVERT 1 298552628 460329369 1039.25
##
## Step: AIC=971.98
## INCOME ~ POP80 + POP92 + BORN_F + POVERT + UNEMP + TEMPER
##
## Df Sum of Sq RSS AIC
## - TEMPER 1 3558242 167084341 971.38
## <none> 163526099 971.98
## - UNEMP 1 6410585 169936684 972.48
## - POP80 1 9779136 173305235 973.75
## - BORN_F 1 15146452 178672551 975.73
## - POP92 1 15318234 178844333 975.80
## - POVERT 1 296876293 460402392 1037.26
##
## Step: AIC=971.38
## INCOME ~ POP80 + POP92 + BORN_F + POVERT + UNEMP
##
## Df Sum of Sq RSS AIC
## <none> 167084341 971.38
## - POP80 1 6384425 173468766 971.81
## - UNEMP 1 6857984 173942325 971.99
## - POP92 1 11761265 178845606 973.80
## - BORN_F 1 19727125 186811466 976.63
## - POVERT 1 329636555 496720896 1040.19
##
## Call:
## lm(formula = INCOME ~ POP80 + POP92 + BORN_F + POVERT + UNEMP,
## data = df, na.action = na.exclude)
##
## Coefficients:
## (Intercept) POP80 POP92 BORN_F POVERT UNEMP
## 19372.8 -2244.4 3251.1 826.2 -610.6 1513.0
stepAIC(model_, direction = "both")
## Start: AIC=976.95
## INCOME ~ AREA + POP80 + POP92 + POPDEN + CRIME + BORN_F + POVERT +
## UNEMP + TEMPER
##
## Df Sum of Sq RSS AIC
## - CRIME 1 389253 161343069 975.10
## - POPDEN 1 392177 161345993 975.10
## - AREA 1 392728 161346544 975.10
## - POP92 1 399383 161353199 975.11
## - TEMPER 1 1868458 162822274 975.70
## - UNEMP 1 4332343 165286159 976.67
## - BORN_F 1 4971076 165924892 976.92
## <none> 160953816 976.95
## - POP80 1 12214364 173168180 979.70
## - POVERT 1 282102879 443056695 1040.76
##
## Step: AIC=975.1
## INCOME ~ AREA + POP80 + POP92 + POPDEN + BORN_F + POVERT + UNEMP +
## TEMPER
##
## Df Sum of Sq RSS AIC
## - POPDEN 1 433672 161776741 973.28
## - AREA 1 434175 161777244 973.28
## - POP92 1 440883 161783952 973.28
## - TEMPER 1 1604802 162947871 973.75
## - UNEMP 1 4814311 166157380 975.01
## <none> 161343069 975.10
## - BORN_F 1 6756546 168099615 975.77
## + CRIME 1 389253 160953816 976.95
## - POP80 1 11885594 173228663 977.72
## - POVERT 1 291537007 452880076 1040.19
##
## Step: AIC=973.28
## INCOME ~ AREA + POP80 + POP92 + BORN_F + POVERT + UNEMP + TEMPER
##
## Df Sum of Sq RSS AIC
## - AREA 1 1749358 163526099 971.98
## - TEMPER 1 1959695 163736436 972.06
## <none> 161776741 973.28
## - UNEMP 1 5356914 167133655 973.39
## - BORN_F 1 7249116 169025857 974.13
## + POPDEN 1 433672 161343069 975.10
## + CRIME 1 430748 161345993 975.10
## - POP80 1 11468717 173245458 975.73
## - POP92 1 16568534 178345275 977.61
## - POVERT 1 298552628 460329369 1039.25
##
## Step: AIC=971.98
## INCOME ~ POP80 + POP92 + BORN_F + POVERT + UNEMP + TEMPER
##
## Df Sum of Sq RSS AIC
## - TEMPER 1 3558242 167084341 971.38
## <none> 163526099 971.98
## - UNEMP 1 6410585 169936684 972.48
## + AREA 1 1749358 161776741 973.28
## + POPDEN 1 1748855 161777244 973.28
## - POP80 1 9779136 173305235 973.75
## + CRIME 1 55035 163471064 973.95
## - BORN_F 1 15146452 178672551 975.73
## - POP92 1 15318234 178844333 975.80
## - POVERT 1 296876293 460402392 1037.26
##
## Step: AIC=971.38
## INCOME ~ POP80 + POP92 + BORN_F + POVERT + UNEMP
##
## Df Sum of Sq RSS AIC
## <none> 167084341 971.38
## - POP80 1 6384425 173468766 971.81
## + TEMPER 1 3558242 163526099 971.98
## - UNEMP 1 6857984 173942325 971.99
## + AREA 1 3347905 163736436 972.06
## + POPDEN 1 3346997 163737344 972.06
## + CRIME 1 95275 166989066 973.34
## - POP92 1 11761265 178845606 973.80
## - BORN_F 1 19727125 186811466 976.63
## - POVERT 1 329636555 496720896 1040.19
##
## Call:
## lm(formula = INCOME ~ POP80 + POP92 + BORN_F + POVERT + UNEMP,
## data = df, na.action = na.exclude)
##
## Coefficients:
## (Intercept) POP80 POP92 BORN_F POVERT UNEMP
## 19372.8 -2244.4 3251.1 826.2 -610.6 1513.0
modelAIC_ <- lm(INCOME ~ POP80 + POP92 + BORN_F + POVERT + UNEMP, data = df, na.action = na.exclude)
modelAIC <- lm.beta(modelAIC_)
summary(modelAIC)
##
## Call:
## lm(formula = INCOME ~ POP80 + POP92 + BORN_F + POVERT + UNEMP,
## data = df, na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4529.1 -1133.5 -68.6 816.4 3585.5
##
## Coefficients:
## Estimate Standardized Std. Error t value Pr(>|t|)
## (Intercept) 19372.7768 NA 4684.0911 4.136 0.000114 ***
## POP80 -2244.4335 -0.4029 1494.8160 -1.501 0.138565
## POP92 3251.1048 0.5559 1595.3127 2.038 0.046050 *
## BORN_F 826.1516 0.1876 313.0183 2.639 0.010610 *
## POVERT -610.5795 -0.8562 56.5935 -10.789 1.39e-15 ***
## UNEMP 1513.0207 0.1193 972.2736 1.556 0.125017
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1683 on 59 degrees of freedom
## Multiple R-squared: 0.8203, Adjusted R-squared: 0.8051
## F-statistic: 53.87 on 5 and 59 DF, p-value: < 2.2e-16
AIC(modelAIC)
## [1] 1157.837
# plot_ellipse(modelAIC, c(1, 2))
# plot_ellipse(modelAIC, c(1, 3))
# plot_ellipse(modelAIC, c(1, 4))
# plot_ellipse(modelAIC, c(1, 5))
plot_ellipse(modelAIC_, c(2, 3))
plot_ellipse(modelAIC_, c(2, 4))
plot_ellipse(modelAIC_, c(2, 5))
plot_ellipse(modelAIC_, c(3, 4))
plot_ellipse(modelAIC_, c(3, 5))
plot_ellipse(modelAIC_, c(4, 5))
hist(residuals(model))
ols_plot_resid_stud_fit(modelAIC_)
print(df_names[c(15, 27, 38, 56, 52), ])
## # A tibble: 5 × 12
## CITY STATE AREA POP80 POP92 POPDEN CRIME BORN_F POVERT INCOME UNEMP TEMPER
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 WASHIN… DC 4.12 13.4 13.3 9.16 9.28 2.27 16.9 30727 2.04 80
## 2 LONG_B… CA 3.91 12.8 13.0 9.08 9.12 3.19 16.8 31938 2.04 73.1
## 3 MIAMI FL 3.57 12.8 12.8 9.24 9.82 4.09 31.2 16925 2.37 82.6
## 4 ST__PE… FL 4.08 12.4 12.4 8.29 9.31 1.97 13.6 23577 1.97 83.1
## 5 NEWARK NJ 3.17 12.7 12.5 9.33 9.60 2.93 26.3 21650 2.53 77.8
ols_plot_resid_stud(modelAIC_)
print(df_names[56, ])
## # A tibble: 1 × 12
## CITY STATE AREA POP80 POP92 POPDEN CRIME BORN_F POVERT INCOME UNEMP TEMPER
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ST__PE… FL 4.08 12.4 12.4 8.29 9.31 1.97 13.6 23577 1.97 83.1
ggplot(data_frame(residuals = rstandard(modelAIC), studres = studres(modelAIC)), aes(x = residuals, y = studres)) +
geom_point() +
geom_abline(slope = 1, intercept = 0, color = "red")
## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## ℹ Please use `tibble()` instead.
ggplot(data_frame(residuals = rstandard(modelAIC), studres = studres(modelAIC)), aes(x = residuals, y = studres - residuals)) +
geom_point() +
geom_abline(slope = 0, intercept = 0, color = "red")
plot(mahalanobis_distance(df)$mahal.dist)
plot(cooks.distance(modelAIC))
df_names[which(cooks.distance(modelAIC) > 0.1), ]
## # A tibble: 3 × 12
## CITY STATE AREA POP80 POP92 POPDEN CRIME BORN_F POVERT INCOME UNEMP TEMPER
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 TUCSON AZ 5.05 12.7 12.9 7.88 9.25 2.37 20.2 21748 1.36 86.6
## 2 MIAMI FL 3.57 12.8 12.8 9.24 9.82 4.09 31.2 16925 2.37 82.6
## 3 ST__PE… FL 4.08 12.4 12.4 8.29 9.31 1.97 13.6 23577 1.97 83.1
…………………..